rm(list = ls())
gc(verbose = FALSE)

args = commandArgs(trailingOnly = "TRUE")
args

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]

# Define library folder
setwd(arg1)
.libPaths(arg2)
.libPaths(.libPaths()[1])

pkg <- c("nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
         "knitr", "kableExtra", "magrittr", "kimisc", "data.table", "readstata13", "DescTools", "scales", "ggplot2")
lapply(pkg, library, character.only = T, lib.loc = arg2)


### loading in the datasets
ihwap_complete = read.csv(paste0(arg1, "/Data/ihwap_boots", arg3,".csv"))

# drop some variables not to be used
ihwap_full = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid))
ihwap_completeX = subset(ihwap_complete, select = -c(Household, ExistingConsumption, ProjectedConsumption, end_day, kfold, kfold2, rowid, total_mmbtu, treated))

# will train model only with untreated homes
ihwap_full = ihwap_full[which(ihwap_full$treated==0), ]

### Data Frames for the models
# dependent variable
train_y = as.matrix(subset(ihwap_full, select = c("total_mmbtu")))
# independent variables
train_x = subset(ihwap_full, select = -c(total_mmbtu, treated))

######## Setup SuperLearner
# Function to create a list of model names
expandingList <- function(capacity = 10) {
    buffer <- vector('list', capacity)
    length <- 0
    
    methods <- list()
    
    methods$double.size <- function() {
        buffer <<- c(buffer, vector('list', capacity))
        capacity <<- capacity * 2
    }
    
    methods$add <- function(val) {
        if(length == capacity) {
            methods$double.size()
        }
        
        length <<- length + 1
        buffer[[length]] <<- val
    }
    
    methods$as.list <- function() {
        b <- buffer[0:length]
        return(b)
    }
    
    methods
}
SL.library<-expandingList()


### different models to try
# xgboost
xgboost.tune <- list(ntrees = c(1000),
                     max_depth = c(20, 30),
                     minobspernode = c(30),
                     shrinkage = c(0.05))

# Set detailed names = T so we can see the configuration for each function.
# Also shorten the name prefix.
xgboost <- create.Learner("SL.xgboost", tune = xgboost.tune, detailed_names = T, name_prefix = "xgboost")
for(i in 1:length(xgboost$names)){
    SL.library$add(c(xgboost$names[i]))
}

fit.gaselec.SL = SuperLearner(Y=train_y, X=train_x, 
                            SL.library=SL.library$as.list(), family=gaussian(),
                            method="method.NNLS", verbose=TRUE, cvControl = list(V = 5))

predictions = predict(fit.gaselec.SL, ihwap_completeX, onlySL = T)
ihwap_complete$pred_best = unlist(as.numeric(predictions$pred))
ihwap_complete = subset(ihwap_complete, select = c(rowid, Household, end_day, end_month, end_year, total_mmbtu, pred_best))

## exporting predictions
write.csv(ihwap_complete, paste0(arg1, "/Results/Model_Outputs/predpre_ihwap_boots", arg3, ".csv"), row.names = FALSE)
